Candace Savonen - CCDL for ALSF
This notebook sets up the MAF data files for comparison and runs some first-line analyses.
It is the first notebook in this series which addresses issue # 30 in OpenPBTA.
To run this from the command line, use:
Rscript -e "rmarkdown::render('analyses/mutect2-vs-strelka2/01-set-up.Rmd',
clean = TRUE)"
This assumes you are in the top directory of the repository.
# We need maftools - this will be added to the running Docker issue whenever it is up
# hexbin and colorblindr are installed the same way when they are missing.
# Each check asks R whether the package namespace can be loaded, rather than
# matching the name against every cell of the installed.packages() matrix,
# which is slow and can also match a name that appears in a non-name column.
if (!requireNamespace("maftools", quietly = TRUE)) {
  devtools::install_github("PoisonAlien/maftools")
}
if (!requireNamespace("hexbin", quietly = TRUE)) {
  install.packages("hexbin")
}
if (!requireNamespace("colorblindr", quietly = TRUE)) {
  devtools::install_github("clauswilke/colorblindr")
}
Get magrittr pipe
# Copy the pipe operator out of the dplyr namespace so that `%>%` can be used
# below without attaching a package with library()
`%>%` <- dplyr::`%>%`
Path to the symlinked data obtained via bash download-data.sh.
# Both folders are addressed relative to this analysis folder, two levels up.
scratch_dir <- file.path("..", "..", "scratch")
data_dir <- file.path("..", "..", "data")
Create output directories in this analysis folder.
# Make sure the `results` and `plots` output folders exist, creating whichever
# one is missing.
invisible(lapply(c("results", "plots"), function(output_dir) {
  if (!dir.exists(output_dir)) {
    dir.create(output_dir)
  }
}))
Running maftools::read.maf takes a lot of computing power and time. To avoid having to run it for both datasets every time we want to re-run this notebook or the analyses in the other notebook, I’ve set this up to save the MAF objects as RDS files.
First let’s establish the file paths.
# Locations, all inside the scratch directory, of the files this analysis
# needs. Despite the `_dir` suffix, each of these is the path to a single file.
mutect2_dir <- file.path(scratch_dir, "mutect2.RDS")
strelka2_dir <- file.path(scratch_dir, "strelka2.RDS")
metadata_dir <- file.path(scratch_dir, "metadata_filtered_maf_samples.tsv")
We will read each dataset in as a maftools object from an RDS file, unless maftools has not been run on it yet. First, establish whether the files we need already exist before running maftools again.
If you are trying to run the set-up step in a Docker container, it will likely be killed for running out of memory unless you can allot ~50GB to Docker.
Prep the metadata to be used as the clinicalData for maftools if it hasn’t been prepped yet.
# One TRUE/FALSE per path, in this order: metadata, Strelka2, MuTect2
files_needed <- file.exists(c(metadata_dir, strelka2_dir, mutect2_dir))
Error in file.exists(metadata_dir, strelka2_dir, mutect2_dir) :
object 'metadata_dir' not found
Get gene summaries and write to TSV files.
# Gene-level summary for Strelka2, written to the results folder.
# NOTE(review): the object assigned here is whatever readr::write_tsv() returns
# at the end of the pipe - confirm the installed readr returns its input
# unchanged, since the joins further down rely on it.
strelka2_gene_sum <- maftools::getGeneSummary(strelka2) %>%
readr::write_tsv(file.path("results",
"strelka2_gene_summary.tsv"))
# Same steps for MuTect2.
mutect2_gene_sum <- maftools::getGeneSummary(mutect2) %>%
readr::write_tsv(file.path("results",
"mutect2_gene_summary.tsv"))
Get sample summaries and write to TSV files.
# Sample-level summary for Strelka2, written to the results folder.
# NOTE(review): the object assigned here is whatever readr::write_tsv() returns
# at the end of the pipe - confirm the installed readr returns its input
# unchanged, since the joins further down rely on it.
strelka2_sample_sum <- maftools::getSampleSummary(strelka2) %>%
readr::write_tsv(file.path("results",
"strelka2_sample_summary.tsv"))
# Same steps for MuTect2.
mutect2_sample_sum <- maftools::getSampleSummary(mutect2) %>%
readr::write_tsv(file.path("results",
"mutect2_sample_summary.tsv"))
# Put the gene summaries from both callers side by side in long format: one
# row per gene and summary variable, with a `mutect2` and a `strelka2` column
# holding each caller's value.
# NOTE(review): a column present in only one of the two summaries receives no
# join suffix and is therefore labelled "strelka2" below - confirm both
# summaries always share the same columns.
combined_gene <- mutect2_gene_sum %>%
  # Columns shared by both tables get the default suffixes: `.x` for the
  # left-hand table (mutect2) and `.y` for the right-hand table (strelka2)
  dplyr::full_join(strelka2_gene_sum, by = 'Hugo_Symbol') %>%
  reshape2::melt(id = 'Hugo_Symbol') %>%
  # The dot is escaped so only the literal `.x` suffix matches; an unescaped
  # `.` is a regex wildcard and would match any name ending in "x"
  dplyr::mutate(dataset = as.character(grepl("\\.x$", variable))) %>%
  dplyr::mutate(dataset = dplyr::recode(dataset,
                                        `TRUE` = "mutect2",
                                        `FALSE` = "strelka2")) %>%
  # Strip the suffix so both callers share one variable name per row
  dplyr::mutate(variable = gsub("\\.x$|\\.y$", "", variable)) %>%
  tidyr::spread('dataset', 'value')
Let’s get a correlation test on the genes overall.
# Spearman (rank) correlation between callers; every gene-by-variable row of
# combined_gene is pooled into this one test
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "spearman")
Cannot compute exact p-value with ties
Spearman's rank correlation rho
data: combined_gene$mutect2 and combined_gene$strelka2
S = 4.8562e+13, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.9550336
# Pearson (linear) correlation on the same pooled gene-by-variable values
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "pearson")
Pearson's product-moment correlation
data: combined_gene$mutect2 and combined_gene$strelka2
t = 563.23, df = 186430, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7919415 0.7953020
sample estimates:
cor
0.7936278
# Put the sample summaries from both callers side by side in long format: one
# row per sample and summary variable, with a `mutect2` and a `strelka2`
# column holding each caller's value.
# NOTE(review): a column present in only one of the two summaries receives no
# join suffix and is therefore labelled "strelka2" below - confirm both
# summaries always share the same columns.
combined_sample <- mutect2_sample_sum %>%
  # Columns shared by both tables get the default suffixes: `.x` for the
  # left-hand table (mutect2) and `.y` for the right-hand table (strelka2)
  dplyr::full_join(strelka2_sample_sum, by = 'Tumor_Sample_Barcode') %>%
  reshape2::melt(id = 'Tumor_Sample_Barcode') %>%
  # The dot is escaped so only the literal `.x` suffix matches; an unescaped
  # `.` is a regex wildcard and would match any name ending in "x"
  dplyr::mutate(dataset = as.character(grepl("\\.x$", variable))) %>%
  dplyr::mutate(dataset = dplyr::recode(dataset,
                                        `TRUE` = "mutect2",
                                        `FALSE` = "strelka2")) %>%
  # Strip the suffix so both callers share one variable name per row
  dplyr::mutate(variable = gsub("\\.x$|\\.y$", "", variable)) %>%
  tidyr::spread('dataset', 'value')
Let’s get a correlation test on the samples overall.
# Spearman (rank) correlation between callers; every sample-by-variable row of
# combined_sample is pooled into this one test
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "spearman")
Cannot compute exact p-value with ties
Spearman's rank correlation rho
data: combined_sample$mutect2 and combined_sample$strelka2
S = 2.6957e+10, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.7758976
# Pearson (linear) correlation on the same pooled sample-by-variable values
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "pearson")
Pearson's product-moment correlation
data: combined_sample$mutect2 and combined_sample$strelka2
t = 750.22, df = 8968, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9917956 0.9924450
sample estimates:
cor
0.992127
# Plot the titv() result for the MuTect2 calls only (Strelka2 is not plotted).
# NOTE(review): titv() may already draw a plot itself by default - confirm
# this call does not produce the same figure twice.
maftools::plotTiTv(maftools::titv(mutect2))
Here we will make these new variables for both the MuTect2 and Strelka2 datasets:
- Calculate VAF for each
- Make a mutation ID by concatenating gene name, type of base change, start position, and tumor sample ID
- Summarize the biotype variable for whether or not it is a coding gene.
Let’s do this for Strelka2 first.
# Build the Strelka2 mutation table with these added columns:
#   vaf         - alt count / (ref count + alt count)
#   base_change - Reference_Allele and Allele joined by ">"
#   coding      - "non-coding" when BIOTYPE is anything but "protein_coding"
#   change      - insertion / deletion / long_change, otherwise base_change
#   mutation_id - gene, change, start position, and sample barcode
#   general_id  - gene and sample barcode
strelka2_vaf <- strelka2@data %>%
  dplyr::mutate(
    vaf = as.numeric(t_alt_count) /
      (as.numeric(t_ref_count) + as.numeric(t_alt_count)),
    base_change = paste0(Reference_Allele, ">", Allele),
    # NOTE(review): a missing BIOTYPE falls through to "protein_coding" -
    # confirm this is intended
    coding = dplyr::case_when(
      BIOTYPE != "protein_coding" ~ "non-coding",
      TRUE ~ "protein_coding"
    )
  ) %>%
  dplyr::mutate(change = dplyr::case_when(
    # base_change starts with "-", i.e. the reference allele is "-"
    grepl("^-", base_change) ~ "insertion",
    # base_change ends with "-", i.e. the alternate allele is "-"
    grepl("-$", base_change) ~ "deletion",
    # Anything longer than a single-base "X>Y" change
    nchar(base_change) > 3 ~ "long_change",
    TRUE ~ base_change
  )) %>%
  dplyr::mutate(
    mutation_id = paste0(Hugo_Symbol, "_",
                         change, "_",
                         Start_Position, "_",
                         Tumor_Sample_Barcode),
    general_id = paste0(Hugo_Symbol, "_", Tumor_Sample_Barcode)
  ) %>%
  # Keep only columns with at least one non-missing value. Selecting by
  # predicate keeps every column when none is entirely NA, whereas negating an
  # empty vector of `which()` indices would select no columns at all.
  dplyr::select_if(function(column) !all(is.na(column)))
NAs introduced by coercion
# Print the Strelka2 data.frame to check the newly added columns
strelka2_vaf
Now we will do the same for MuTect2.
# Build the MuTect2 mutation table with these added columns:
#   vaf         - alt count / (ref count + alt count)
#   base_change - Reference_Allele and Allele joined by ">"
#   coding      - "non-coding" when BIOTYPE is anything but "protein_coding"
#   change      - insertion / deletion / long_change, otherwise base_change
#   mutation_id - gene, change, start position, and sample barcode
#   general_id  - gene and sample barcode
mutect2_vaf <- mutect2@data %>%
  dplyr::mutate(
    vaf = as.numeric(t_alt_count) /
      (as.numeric(t_ref_count) + as.numeric(t_alt_count)),
    base_change = paste0(Reference_Allele, ">", Allele),
    # NOTE(review): a missing BIOTYPE falls through to "protein_coding" -
    # confirm this is intended
    coding = dplyr::case_when(
      BIOTYPE != "protein_coding" ~ "non-coding",
      TRUE ~ "protein_coding"
    )
  ) %>%
  dplyr::mutate(change = dplyr::case_when(
    # base_change starts with "-", i.e. the reference allele is "-"
    grepl("^-", base_change) ~ "insertion",
    # base_change ends with "-", i.e. the alternate allele is "-"
    grepl("-$", base_change) ~ "deletion",
    # Anything longer than a single-base "X>Y" change
    nchar(base_change) > 3 ~ "long_change",
    TRUE ~ base_change
  )) %>%
  dplyr::mutate(
    mutation_id = paste0(Hugo_Symbol, "_",
                         change, "_",
                         Start_Position, "_",
                         Tumor_Sample_Barcode),
    general_id = paste0(Hugo_Symbol, "_", Tumor_Sample_Barcode)
  ) %>%
  # Keep only columns with at least one non-missing value. Selecting by
  # predicate keeps every column when none is entirely NA, whereas negating an
  # empty vector of `which()` indices would select no columns at all.
  dplyr::select_if(function(column) !all(is.na(column)))
# Take a look at this df
mutect2_vaf
Save to a TSV file.
# Combine the Strelka2 and MuTect2 tables, matching rows on mutation_id; columns
# found in both tables are told apart by a caller-specific suffix
vaf_df <- dplyr::full_join(strelka2_vaf, mutect2_vaf,
                           by = 'mutation_id',
                           suffix = c(".strelka2", ".mutect2")) %>%
  # Label each row by which caller(s) supplied it: a missing Allele on one side
  # of the join means that caller has no row for this mutation_id
  dplyr::mutate(dataset = dplyr::if_else(
    is.na(Allele.mutect2),
    "strelka2_only",
    dplyr::if_else(is.na(Allele.strelka2), "mutect2_only", "both")
  )) %>%
  # Write the merged table to the results folder
  readr::write_tsv(file.path("results", "combined_results.tsv"))
Session Info:
# Print the R version, platform, and package versions used for this run
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] Biobase_2.44.0 BiocGenerics_0.30.0
loaded via a namespace (and not attached):
[1] fs_1.3.1 usethis_1.5.1 devtools_2.1.0 doParallel_1.0.15 RColorBrewer_1.1-2
[6] rprojroot_1.3-2 tools_3.6.1 backports_1.1.4 utf8_1.1.4 R6_2.4.0
[11] lazyeval_0.2.2 colorspace_1.4-1 withr_2.1.2 tidyselect_0.2.5 prettyunits_1.0.2
[16] processx_3.4.1 compiler_3.6.1 VennDiagram_1.6.20 cli_1.1.0 formatR_1.7
[21] desc_1.2.0 pkgmaker_0.27 labeling_0.3 scales_1.0.0 hexbin_1.27.3
[26] readr_1.3.1 callr_3.3.1 NMF_0.21.0 stringr_1.4.0 digest_0.6.20
[31] rmarkdown_1.14 R.utils_2.9.0 base64enc_0.1-3 pkgconfig_2.0.2 htmltools_0.3.6
[36] bibtex_0.4.2 sessioninfo_1.1.1 rlang_0.4.0 rstudioapi_0.10 jsonlite_1.6
[41] dplyr_0.8.3 R.oo_1.22.0 magrittr_1.5 wordcloud_2.6 futile.logger_1.4.3
[46] Matrix_1.2-17 Rcpp_1.0.2 munsell_0.5.0 fansi_0.4.0 R.methodsS3_1.7.1
[51] stringi_1.4.3 yaml_2.2.0 pkgbuild_1.0.4 plyr_1.8.4 grid_3.6.1
[56] crayon_1.3.4 lattice_0.20-38 splines_3.6.1 hms_0.5.0 zeallot_0.1.0
[61] knitr_1.24 ps_1.3.0 pillar_1.4.2 rngtools_1.4 reshape2_1.4.3
[66] codetools_0.2-16 pkgload_1.0.2 futile.options_1.0.1 glue_1.3.1 evaluate_0.14
[71] lambda.r_1.2.3 data.table_1.12.2 remotes_2.1.0 BiocManager_1.30.4 colorblindr_0.1.0
[76] vctrs_0.2.0 foreach_1.4.7 testthat_2.2.1 gtable_0.3.0 purrr_0.3.2
[81] tidyr_0.8.3 assertthat_0.2.1 ggplot2_3.2.1 xfun_0.8 gridBase_0.4-7
[86] xtable_1.8-4 survival_2.44-1.1 tibble_2.1.3 iterators_1.0.12 registry_0.5-1
[91] memoise_1.1.0 maftools_2.0.15 cluster_2.1.0